import numpy as np
from scipy import stats
import plotly.graph_objects as go
from thompson_sampling import SlotMachine, ThompsonSampling
Here I use the ThompsonSampling() and SlotMachine() classes to simulate a Thompson Sampling run for 10k steps.
# Simulate a Thompson Sampling run, recording at every step the Beta posterior
# parameters, the point estimates, and the instantaneous regret.
slots = [SlotMachine(p, seed=42 + seed) for seed, p in enumerate([0.1, 0.4, 0.46, 0.6, 0.61])]
n_slots = len(slots)
sampling = ThompsonSampling(slots, seed=1)

n_steps = 10_000
alphas = np.zeros([n_steps, n_slots])  # Beta posterior alpha per slot, over time
betas = np.zeros([n_steps, n_slots])   # Beta posterior beta per slot, over time
means = np.zeros([n_steps, n_slots])   # posterior means (point estimates of each p)
stds = np.zeros([n_steps, n_slots])    # posterior stds (agent's uncertainty)
regret = np.zeros(n_steps)             # per-step regret

# Best achievable expected reward (0.61 here); computed rather than hard-coded
# so the probability list above can be changed freely.
best_p = max(slot.p for slot in slots)
for step in range(n_steps):
    chosen_slot, _ = sampling.act()
    alphas[step] = sampling.alphas
    betas[step] = sampling.betas
    means[step] = sampling.get_means()
    stds[step] = sampling.get_stds()
    # Regret = best machine's expected reward minus the chosen machine's.
    regret[step] = best_p - slots[chosen_slot].p
Here I plot the posterior distributions for the 5 slot machines; since the plot is an interactive animation, the time axis is the literal playback time.
(Probably best viewed as html file in output/distributions_evolution.html)
# Evaluate each slot's Beta posterior pdf on a grid, keeping one frame
# every 100 simulation steps for the animation.
x = np.linspace(0, 1, 201)
frames = [
    [stats.beta.pdf(x, a, b) for a, b in zip(alphas[step], betas[step])]
    for step in range(0, n_steps, 100)
]
# Plot
# Animated figure: one filled Scatter per slot showing its Beta posterior pdf,
# with a play button and a slider to scrub through the sampled frames.
fig = go.Figure(
# Initial (static) traces show the last frame, i.e. the converged posteriors.
data=[go.Scatter(x=x, y=y, fill='tozeroy', name=f'Slot {slot}') for slot, y in enumerate(frames[-1])],
layout=go.Layout(
# Fixed axes so the view doesn't rescale between animation frames.
xaxis=dict(range=[0, 1], autorange=False),
yaxis=dict(range=[0, 12], autorange=False),
title='Step 0',
# Play button: args=[None, ...] animates through all frames in order,
# 50 ms per frame.
updatemenus=[{
'buttons': [{
'args': [None, {'frame': {'duration': 50}}],
'label': '▶',
'method': 'animate',
}],
'direction': 'left',
'pad': {'r': 10, 't': 40},
'type': 'buttons',
'x': 0.1,
'y': 0,
}
],
# Slider: one step per frame; each step jumps immediately to the frame
# whose name matches the frame index (labels are frame indices, i.e.
# simulation step / 100).
sliders = [
{
'len': 0.9,
'x': 0.1,
'y': 0,
'steps': [{
'args': [[f'{step}'], {'frame': {'duration': 0}, 'mode': 'immediate'}],
'label': f'{step}',
'method': 'animate',
} for step, _ in enumerate(frames)
],
}
]
),
# One go.Frame per sampled step; the frame name must match the slider args
# above, and the title shows the actual simulation step (index * 100).
frames=[
go.Frame(
data=[go.Scatter(x=x, y=y, fill='tozeroy', name=f'Slot {slot}') for slot, y in enumerate(frame)],
layout=go.Layout(title_text=f'Step {step * 100}'),
name=f'{step}',
)
for step, frame in enumerate(frames)
]
)
fig.update_layout(height=700)
fig.show(renderer='notebook_connected')
fig.write_html('output/distributions_evolution.html')
Here I plot the means and stds of the beta distributions over time.
The best point estimate at each point in time is the mean with highest value. The stds give a measure of the confidence of the agent.
(p on the y axis, time on the x axis)
(Probably best viewed as html file in output/distributions_timeline.html)
# Timeline of posterior means with a ±1-std band per slot.
# Trick: traces sharing a stackgroup are summed cumulatively, so plotting
# mean, +std, and -2*std draws boundaries at mean, mean+std, and mean-std.
colors = ['rgba(99,110,250', 'rgba(239,85,59', 'rgba(0,204,150', 'rgba(171,99,250', 'rgba(255,161,90']
fig = go.Figure()
for i in range(n_slots):
    solid = colors[i] + ',1)'
    # Posterior mean: solid line, the only trace of the group in the legend.
    fig.add_trace(go.Scatter(
        y=means[:, i],
        stackgroup=i,
        line={'color': solid},
        fill='none',
        name=f'Slot {i}',
        legendgroup=i,
    ))
    # Upper band edge at mean + std (transparent fill: only the dotted line).
    fig.add_trace(go.Scatter(
        y=stds[:, i],
        fill='tonexty',
        stackgroup=i,
        mode='lines',
        line={'width': 0.8, 'color': solid, 'dash': 'dot'},
        fillcolor='rgba(0,0,0,0)',
        showlegend=False,
        legendgroup=i,
    ))
    # Lower band edge at mean - std; shade the band between the two edges.
    fig.add_trace(go.Scatter(
        y=-2 * stds[:, i],
        fill='tonexty',
        stackgroup=i,
        mode='lines',
        line={'width': 0.8, 'color': solid, 'dash': 'dot'},
        fillcolor=colors[i] + ',0.12)',
        showlegend=False,
        legendgroup=i,
    ))
fig.update_layout(height=600)
fig.show(renderer='notebook_connected')
fig.write_html('output/distributions_timeline.html')
# Cumulative regret over time: the flatter the curve, the better the agent
# is exploiting the best machine.
fig = go.Figure()
cumulative_regret = np.cumsum(regret)
fig.add_trace(go.Scatter(y=cumulative_regret, name='Regret'))
fig.show(renderer='notebook_connected')
fig.write_html('output/regret.html')
Not very well.
The algorithm focuses most of its time on the best slot(s) and converges to a good estimate for those, but it neglects the other slots and doesn't arrive at a good estimate for them.
The slots with lower probabilities are neglected because of the exploration-exploitation tradeoff. Once the agent is confident enough that the best slot is not among these, it focuses almost exclusively on the best ones, exploiting them.
In order to explore more we could just use a uniform distribution instead of a beta distribution (or just cycle through the slots). Doing so we would explore much more and we would converge to better estimates (but we would exploit less).